import os
import logging
from typing import List,Tuple
from PyPDF2 import PdfReader

def extract_text_with_page_number(pdf) -> Tuple[str,List[int]]:
    """Extract the text of a PDF and record the page number of every line.

    The text of consecutive pages is joined with a newline. Without that
    separator the last line of one page would fuse with the first line of
    the next, and the returned text would contain fewer lines than
    ``page_numbers`` has entries.

    Args:
        pdf: An object exposing a ``pages`` iterable whose items provide an
            ``extract_text()`` method (for example a PyPDF2 ``PdfReader``).

    Returns:
        A ``(text, page_numbers)`` tuple. ``text`` is the extracted text of
        all pages that yielded text, separated by newlines. ``page_numbers``
        holds one 1-based page number per ``"\\n"``-separated line of
        ``text``, in order. Pages without extractable text are skipped with
        a warning and contribute to neither value.
    """
    page_texts = []
    page_numbers = []
    for page_number, page in enumerate(pdf.pages, start=1):
        extracted_text = page.extract_text()
        if not extracted_text:
            # Covers both an empty string and None.
            logging.warning("No text found on page %s.", page_number)
            continue
        page_texts.append(extracted_text)
        # One entry per line on this page, so index i of page_numbers maps
        # to line i of the joined text.
        page_numbers.extend([page_number] * len(extracted_text.split("\n")))
    return "\n".join(page_texts), page_numbers



if __name__=="__main__":
    import sys

    # Use the PDF given as the first command-line argument, falling back to
    # the original sample document when none is supplied.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "/data/data_ldf/doc/Linux_centos7.4.pdf"
    pdf_reader = PdfReader(pdf_path)
    # Extract the text together with the page number of every line.
    text, page_numbers = extract_text_with_page_number(pdf_reader)
    print(text)
    print(page_numbers)